###############################################################################
#Title    : final_analysis_immigration_fuel_populism.R
#Goal     : This is a R code script to run the analysis of
#           "Does Immigration Fuel Populism?: The Effect of Immigration on the Rise of Populist Rhetoric"
#Date     : Nov 27th, 2025
#Inputs   : # final_data_gasd.rds
#Outputs  : # germany_storage.rds
            # final_germany_stm_model.Rdata
            # austria_storage.rds
            # final_austria_stm_model.Rdata
            # sweden_storage.rds
            # final_sweden_stm_model.Rdata
            # denmark_storage.rds
            # final_denmark_stm_model.Rdata
            # sentiment_analysis_python.rds
###############################################################################

# 1. set-up ====================================================================
  rm(list=ls())
  
  library(pacman)
  p_load(
    tidyverse,
    stm,
    tm,
    gridExtra,
    tidystm,
    quanteda,
    SnowballC,
    scales,
    stopwords,
    patchwork
  )  

# 2. Descriptive Analysis =======================================================
  data_total<-readRDS("final_data_gasd.rds")
  data_total<-data_total %>% rename(country = country_name)
  
  #country
  table(data_total$country)
  
  #Agenda
  sum(is.na(data_total[data_total$country=="Austria",]$agenda))
  sum(is.na(data_total[data_total$country=="Denmark",]$agenda))
  sum(is.na(data_total[data_total$country=="Germany",]$agenda))
  sum(is.na(data_total[data_total$country=="Sweden",]$agenda))

  
  #Party
  table(data_total[data_total$country=="Austria",]$party)
  table(data_total[data_total$country=="Germany",]$party)
  table(data_total[data_total$country=="Denmark",]$party)
  table(data_total[data_total$country=="Sweden",]$party)
  
  #populism
  table(data_total[data_total$country=="Austria",]$party, data_total[data_total$country=="Austria",]$populist)
  table(data_total[data_total$country=="Germany",]$party, data_total[data_total$country=="Germany",]$populist)
  table(data_total[data_total$country=="Denmark",]$party, data_total[data_total$country=="Denmark",]$populist)
  table(data_total[data_total$country=="Sweden",]$party,  data_total[data_total$country=="Sweden",]$populist)
  
# 3. Frequency of immigrant-speech =============================================
  data_prop <- data_total %>%
    group_by(country, party) %>%
    summarize(
      party_total_speech = n(),
      party_immigration_speech = sum(immigration_agenda),
      party_immi_speech_prop = party_immigration_speech / party_total_speech,
      se = sqrt(party_immi_speech_prop * (1 - party_immi_speech_prop) / party_total_speech),
      lower_limit = party_immi_speech_prop - 1.96 * se,
      upper_limit = party_immi_speech_prop + 1.96 * se,
      left_right = mean(left_right, na.rm = TRUE),
      left_right_ches = mean(left_right_ches, na.rm = TRUE),
      populist = mean(populist, na.rm = TRUE)
    ) %>% ungroup()
  
  data_prop<-data_prop %>% filter(party_immigration_speech!=0)
  
  # Distribution over party
  austria_party<-data_prop %>% filter(country == "Austria")
  germany_party<-data_prop %>% filter(country == "Germany")
  denmark_party<-data_prop %>% filter(country == "Denmark")
  sweden_party<-data_prop %>% filter(country == "Sweden")
  
  austria_hist <- ggplot(austria_party, aes(x = reorder(party, party_immi_speech_prop), y = party_immi_speech_prop)) +
    geom_bar(stat = "identity", aes(fill = as.factor(populist)), position = "dodge") +
    geom_errorbar(aes(ymin = lower_limit, ymax = upper_limit), width = 0.2, position = position_dodge(0.9)) +
    scale_fill_manual(values = c("gray80", "red"), labels = c("Non-populist", "Populist")) +
    labs(x = "Austria", y = "", title = "",
         fill = "Type") +
    theme_bw() +
    theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 10),
          legend.position = "bottom")
  
  germany_hist<-ggplot(germany_party, aes(x = reorder(party, party_immi_speech_prop), y = party_immi_speech_prop)) +
    geom_bar(stat = "identity", aes(fill = as.factor(populist)), position = "dodge") +
    geom_errorbar(aes(ymin = lower_limit, ymax = upper_limit), width = 0.2, position = position_dodge(0.9)) +
    scale_fill_manual(values = c("gray80", "red"), labels = c("Non-populist", "Populist")) +
    labs(x = "Germany", y = "", title = "",
         fill = "Type") +
    theme_bw() +
    theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 10),
          legend.position = "bottom")
  
  denmark_hist<-ggplot(denmark_party, aes(x = reorder(party, party_immi_speech_prop), y = party_immi_speech_prop)) +
    geom_bar(stat = "identity", aes(fill = as.factor(populist)), position = "dodge") +
    geom_errorbar(aes(ymin = lower_limit, ymax = upper_limit), width = 0.2, position = position_dodge(0.9)) +
    scale_fill_manual(values = c("gray80", "red"), labels = c("Non-populist", "Populist")) +
    labs(x = "Denmark", y = "", title = "",
         fill = "Type") +
    theme_bw() +
    theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 10),
          legend.position = "bottom")
  
  sweden_hist<-ggplot(sweden_party, aes(x = reorder(party, party_immi_speech_prop), y = party_immi_speech_prop)) +
    geom_bar(stat = "identity", aes(fill = as.factor(populist)), position = "dodge") +
    geom_errorbar(aes(ymin = lower_limit, ymax = upper_limit), width = 0.2, position = position_dodge(0.9)) +
    scale_fill_manual(values = c("gray80", "red"), labels = c("Non-populist", "Populist")) +
    labs(x = "Sweden", y = "", title = "",
         fill = "Type") +
    theme_bw() +
    theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 10),
          legend.position = "bottom")
  
  ggsave("graph/Proportion_Party_Austria.png",austria_hist, width = 10, height = 8, dpi = 300)
  ggsave("graph/Proportion_Party_Germany.png",germany_hist, width = 10, height = 8, dpi = 300)
  ggsave("graph/Proportion_Party_Denmark.png",denmark_hist, width = 10, height = 8, dpi = 300)
  ggsave("graph/Proportion_Party_Sweden.png", sweden_hist,  width = 10, height = 8, dpi = 300)
  
  # By populism
  data_popul_prop <- data_total %>%
    group_by(country, populist) %>%
    summarize(party_total_speech = n(),
              party_immigration_speech = sum(immigration_agenda),
              party_immi_speech_prop = party_immigration_speech / party_total_speech,
              se = sqrt(party_immi_speech_prop * (1 - party_immi_speech_prop) / party_total_speech),
              lower_limit = party_immi_speech_prop - 1.96 * se,
              upper_limit = party_immi_speech_prop + 1.96 * se) %>% ungroup()
  
  # Split the data by country
  austria_popul_party<-data_popul_prop %>% filter(country == "Austria")
  germany_popul_party<-data_popul_prop %>% filter(country == "Germany")
  denmark_popul_party<-data_popul_prop %>% filter(country == "Denmark")
  sweden_popul_party<-data_popul_prop %>% filter(country == "Sweden")
  
  base_theme <- theme_bw() +
    theme(
      plot.title = element_text(hjust = 0.5),
      axis.title.y = element_blank(),
      axis.text.x = element_blank(),
      legend.position = "none"
    )
  y_scale1 <- scale_y_continuous(limits = c(0, 0.008))
  y_scale2 <- scale_y_continuous(limits = c(0, 0.035))
  
  austria_hist <- ggplot(austria_popul_party, aes(x =populist, y = party_immi_speech_prop)) +
    geom_bar(stat = "identity", aes(fill = as.factor(populist)), position = "dodge") +
    geom_errorbar(aes(ymin = lower_limit, ymax = upper_limit), width = 0.2, position = position_dodge(0.9)) +
    scale_fill_manual(values = c("gray80", "red"), labels = c("Non-populist", "Populist")) + y_scale1 +
    labs(x = "Austria", y = "", title = "",
         fill = "Type") +
    theme_bw() +
    base_theme
  
  germany_hist<- ggplot(germany_popul_party, aes(x =populist, y = party_immi_speech_prop)) +
    geom_bar(stat = "identity", aes(fill = as.factor(populist)), position = "dodge") +
    geom_errorbar(aes(ymin = lower_limit, ymax = upper_limit), width = 0.2, position = position_dodge(0.9)) +
    scale_fill_manual(values = c("gray80", "red"), labels = c("Non-populist", "Populist")) + y_scale1 +
    labs(x = "Germany", y = "", title = "",
         fill = "Type") +
    theme_bw() +
    base_theme
  
  denmark_hist<- ggplot(denmark_popul_party, aes(x =populist, y = party_immi_speech_prop)) +
    geom_bar(stat = "identity", aes(fill = as.factor(populist)), position = "dodge") +
    geom_errorbar(aes(ymin = lower_limit, ymax = upper_limit), width = 0.2, position = position_dodge(0.9)) +
    scale_fill_manual(values = c("gray80", "red"), labels = c("Non-populist", "Populist")) + y_scale2 +
    labs(x = "Denmark", y = "", title = "",
         fill = "Type") +
    theme_bw() +
    base_theme
  
  sweden_hist<- ggplot(sweden_popul_party, aes(x =populist, y = party_immi_speech_prop)) +
    geom_bar(stat = "identity", aes(fill = as.factor(populist)), position = "dodge") +
    geom_errorbar(aes(ymin = lower_limit, ymax = upper_limit), width = 0.2, position = position_dodge(0.9)) +
    scale_fill_manual(values = c("gray80", "red"), labels = c("Non-populist", "Populist")) + y_scale2 +
    labs(x = "Sweden", y = "", title = "",
         fill = "Type") +
    theme_bw() +
    base_theme

  combined_plot <- (austria_hist | denmark_hist) /
    (germany_hist | sweden_hist) +
    plot_layout(guides = "collect") &
    theme(legend.position = "bottom")
  
  # Display the combined plot
  combined_plot
  
  ggsave("graph/Proporttion_Populism_Comparison.png",combined_plot, width = 10, height = 8, dpi = 300)
 
# 4. Topic Modeling ============================================================
  rm(list=ls())
  data_total<-readRDS("final_data_gasd.rds")
  data_total<-data_total %>% rename(country = country_name,
                                    raw_text = text)
  
  plot.searchK<-function(x, ...){
    oldpar <- par(no.readonly=TRUE)
    g <- x$results
    par(mfrow=c(2,2),mar=c(4,4,4,4),oma=c(2,2,2,2))
    
    plot(g$K,g$heldout,type="p", main="Held-Out Likelihood", xlab="Number of Topics (K)", ylab="Held-Out Likelihood")
    lines(g$K,g$heldout,lty=1,col=1)
    
    plot(g$K,g$residual,type="p", main="Residuals", xlab="Number of Topics (K)", ylab="Residuals")
    lines(g$K,g$residual,lty=1,col=1 )
    
    if(!is.null(g$semcoh)){
      plot(g$K,g$semcoh,type="p", main="Semantic Coherence", xlab="Number of Topics (K)", ylab="Semantic Coherence")
      lines(g$K,g$semcoh,lty=1,col=1 ) 
    }
    
    plot(g$K,g$exclus,type="p", main="Exclusivity", xlab="Number of Topics (K)", ylab="Exclusivity")
    lines(g$K,g$exclus,lty=1,col=1 )
    
    #plot(g$K,g$bound,type="n", main="Bound", xlab="Number of Topics (K)", ylab="Bound")
    #lines(g$K,g$bound,lty=1,col=1 ) 
    
    # plot(g$K,g$lbound,type="p", main="Lower Bound", xlab="Number of Topics (K)", ylab="Lower Bound")
    # lines(g$K,g$lbound,lty=1,col=1 ) 
    
    #title("Diagnostic Values by Number of Topics", outer=TRUE)  
    par(oldpar)
  }

## 4.1 STM: Germany ============================================================    
  data_stm_germany <- data_total %>% filter(country=="Germany", immigration_agenda==1) 
  data_stm_germany <- data_stm_germany %>% 
    dplyr::select(country, election_id, party, speaker, year, 
                  populist, total_immi_prop, left_right, immigration_agenda, raw_text) %>% 
    mutate(docid = row_number())
  
  table(data_stm_germany$populist)
  
  corp <- corpus(data_stm_germany, text_field = "raw_text", docid_field = "docid", meta = TRUE)
  docvars(corp)$raw_text <- as.character(corp) # for stm (findThoughts)
  
  tokens <- tokens(corp,
                   what = "word",
                   remove_punct = TRUE, # removes punctuation
                   remove_symbols = TRUE, # removes unicode symbols
                   remove_numbers = TRUE, # removes numbers but not words starting with digits
                   remove_url = TRUE, # eliminates URLs beginning with http/https
                   remove_separators = TRUE, # removes separators as categorized in unicode
                   include_docvars = TRUE) # whether to pass on the docvars to the tokens object
  
  #Lower case
  doc_term_matrix <- dfm(tokens, tolower = TRUE)
  
  #Remove stopwords
  doc_term_matrix  <- dfm_select(doc_term_matrix, 
                                 pattern = stopwords("de", source = "snowball"), 
                                 selection = "remove")
  
  #stem
  doc_term_matrix <- dfm_wordstem(doc_term_matrix)
  
  #trim to focus on common, but distinguishing features
  doc_term_matrix <- dfm_trim(doc_term_matrix, min_docfreq = 0.075,
                              max_docfreq = 0.90,
                              docfreq_type = "prop")
  
  #STM
    #filter out empty documents
    doc_term_matrix <- doc_term_matrix[ntoken(doc_term_matrix) > 0, ]
    doc_count <- ndoc(doc_term_matrix)
  
  out<-quanteda::convert(doc_term_matrix, to="stm")
  
  #Number of K
  # storage <- searchK(documents = out$documents,
  #                    vocab = out$vocab,
  #                    data = out$meta,
  #                    K=c(5,10,15,20,25,30,35,40,45,50), #Can Change
  #                    prevalence =~ party + speaker + s(year)+ populist + total_immi_prop + left_right,
  #                    max.em.its = 500, gamma.prior = "L1",
  #                    init.type = "Spectral", seed=9999, verbose =TRUE)
  # 
  # saveRDS(storage, "stm_model/germany_storage.rds")
  
    #Visualization
    storage <-readRDS("stm_model/germany_storage.rds")
    png("graph/searchK_Germany.png", width = 800, height = 600)
    plot(storage)
    dev.off()
  
  #K=20 Looks Good!
    # germany_stm <- stm(documents = out$documents,
    #                    vocab = out$vocab,
    #                    data = out$meta,
    #                    K=20,
    #                    prevalence =~ party + speaker + s(year)+ populist + total_immi_prop + left_right,
    #                    max.em.its = 500, gamma.prior = "L1",
    #                    init.type = "Spectral", seed=9999, verbose =TRUE)
    # save(germany_stm, file = "stm_model/final_germany_stm_model.Rdata")
  load("stm_model/final_germany_stm_model.Rdata")
  
  #Topic Distribution
  labelTopics(germany_stm)
    
  #Key examples
  texts <-findThoughts(germany_stm, texts = out$meta$raw_text, n=3, thresh = 0.1)
  example_texts<-texts[["docs"]]
  example_texts
  
  #Topic 4,6,7,9,12,14,20 is relevant
  prep_germnay<-estimateEffect(1:20~ populist+total_immi_prop + left_right,
                       germany_stm, 
                       meta=out$meta, uncertainty="Global")
  summary(prep_germnay, topics=seq(1,20,by=1), nsim=1000)
  summary(prep_germnay, topics=c(4,6,7,9,12,14,20), nsim=1000)
  
  effect_germany<-extract.estimateEffect(prep_germnay, "populist",
                                         cov.value1 = "1",cov.value2 = "0", 
                                         model = germany_stm, method = "difference") 
  effect_germany <- effect_germany %>% filter(topic %in% c(4,6,7,9,12,14,20))
  
  effect_germany <- effect_germany %>% dplyr::mutate(country = "Germany",
                                                     topic_name = c("International Support for the Syrian Refugees",
                                                                    "Critique of Immigration Policies",
                                                                    "Protection of Women and Children Refugees",
                                                                    "Critique of Asylum Procedures",
                                                                    "Migration Pact vs National Sovereignty",
                                                                    "Selective Immigration & Immigrants' Responsibility",
                                                                    "EU Refugee Burden-sharing"))
  
  prep_germnay2<-estimateEffect(1:20~ populist* total_immi_prop +left_right,
                                germany_stm, 
                                meta=out$meta, uncertainty="Global")
  summary(prep_germnay2, topics=c(4,6,7,9,12,14,20), nsim=1000)
  
  out$meta<-out$meta %>% mutate(dummy = ifelse(year %in% c(2015,2016), 1, 0))
  prep_germnay3<-estimateEffect(1:20~ populist* total_immi_prop +left_right + dummy,
                                germany_stm, 
                                meta=out$meta, uncertainty="Global")
  summary(prep_germnay3, topics=c(4,6,7,9,12,14,20), nsim=1000)
  
  for(k in c(3,2)){
    
    current_prep <- if(k == 2) prep_germnay2 else prep_germnay3
    
    effect_germany_int <- lapply(c(0,1), function(i) {
      extract.estimateEffect(x = current_prep,
                             covariate = "total_immi_prop",
                             method = "continuous",
                             model = germany_stm,
                             moderator = "populist",
                             moderator.value = i)
    })
    
    effect_germany_int <- do.call("rbind", effect_germany_int) %>%
      filter(topic %in% c(4,6,7,9,12,14,20)) %>%
      mutate(country = "Germany",
             topic_name = case_when(
               topic == 4 ~ "International Support for the Syrian Refugees",
               topic == 6 ~ "Critique of Immigration Policies",
               topic == 7 ~ "Protection of Women and Children Refugees",
               topic == 9 ~ "Critique of Asylum Procedures",
               topic == 12 ~ "Migration Pact vs National Sovereignty",
               topic == 14 ~ "Selective Immigration & Immigrants' Responsibility",
               topic == 20 ~ "EU Refugee Burden-sharing"
             ))
    
    fig1 <- effect_germany_int %>%
      ggplot(aes(x = covariate.value, y = estimate, ymin = ci.lower, ymax = ci.upper, 
                 group = factor(moderator.value), fill = factor(moderator.value))) +
      facet_wrap(~ reorder(topic_name, topic), ncol = 2, labeller = labeller(topic_name = label_wrap_gen(width = 50))) +
      geom_ribbon(alpha = 0.5) +
      geom_line(aes(y = estimate), size = 1) +
      geom_line(aes(y = ci.lower), linetype = "dashed") +
      geom_line(aes(y = ci.upper), linetype = "dashed") +
      xlab("Immigration Level (Germany)") +
      ylab("Estimated Effect") +
      scale_fill_discrete(name = "Populism", labels = c("Non-populist", "Populist")) +
      theme_bw() +
      theme(strip.text = element_text(size = 10),
            legend.position = "bottom",
            legend.direction = "horizontal")
    
    ggsave(filename = paste0("graph/germany_interaction_effect_topic", k, ".png"),
           plot = fig1, width = 16, height = 12, dpi = 300)
  }
  
  #Topic proportion for each speeches
    relavant_topics <- c(4,6,7,9,12,14,20)
    a<-germany_stm$theta   # document-topic distribution
    colnames(a)<-paste0("Topic ", seq(1:20))
    # Multiply all topic columns by 100 and calculate total
    topic.doc <- as.data.frame(a) %>%
      mutate(total = rowSums(., na.rm = TRUE))               # Sum all columns
    topic.doc <- topic.doc*100
    
    # Sum the relevant topics
    topic.doc$sum_relevant_topic <- rowSums(topic.doc[, relavant_topics])
    topic.doc <- topic.doc %>% mutate(sum_irrelevant_topic = 100-sum_relevant_topic)
    
    # Bind text and topic.doc, with an "all" column set to TRUE
    response_texts <- cbind(out$meta, topic.doc)
    german_response_texts <-response_texts
  
## 4.2 STM: Austria ============================================================
  data_stm <- data_total %>% filter(country=="Austria", immigration_agenda==1) 
  data_stm <- data_stm %>% 
    dplyr::select(country, election_id, party, speaker, year, 
                  populist, total_immi_prop, left_right, immigration_agenda, raw_text) %>% 
    mutate(docid = row_number())
  data_stm <- drop_na(data_stm)
  
  table(data_stm$populist)

  corp <- corpus(data_stm, text_field = "raw_text", docid_field = "docid", meta = TRUE)
  docvars(corp)$raw_text <- as.character(corp) # for stm (findThoughts)
  
  tokens <- tokens(corp,
                   what = "word",
                   remove_punct = TRUE, # removes punctuation
                   remove_symbols = TRUE, # removes unicode symbols
                   remove_numbers = TRUE, # removes numbers but not words starting with digits
                   remove_url = TRUE, # eliminates URLs beginning with http/https
                   remove_separators = TRUE, # removes separators as categorized in unicode
                   include_docvars = TRUE) # whether to pass on the docvars to the tokens object
  
  #Lower case
  doc_term_matrix <- dfm(tokens, tolower = TRUE)
  
  #Remove stopwords
  doc_term_matrix  <- dfm_select(doc_term_matrix, 
                                 pattern = stopwords("de", source = "snowball"), 
                                 selection = "remove")
  
  #stem
  doc_term_matrix <- dfm_wordstem(doc_term_matrix)
  
  #trim to focus on common, but distinguishing features
  doc_term_matrix <- dfm_trim(doc_term_matrix, min_docfreq = 0.075,
                              max_docfreq = 0.90,
                              docfreq_type = "prop")
  
  #STM
  #filter out empty documents
  doc_term_matrix <- doc_term_matrix[ntoken(doc_term_matrix) > 0, ]
  doc_count <- ndoc(doc_term_matrix)
  
  out<-quanteda::convert(doc_term_matrix, to="stm")
  
  #Number of K
  # storage <- searchK(documents = out$documents,
  #                    vocab = out$vocab,
  #                    data = out$meta,
  #                    K=c(5,10,15,20,25,30,35,40,45,50), #Can Change
  #                    prevalence =~ party + speaker + s(year)+ populist + total_immi_prop + left_right,
  #                    max.em.its = 500,
  #                    init.type = "Spectral", seed=9999, verbose =TRUE)
  # saveRDS(storage, "stm_model/austria_storage.rds")
  
    #Visualization
    storage <-readRDS("stm_model/austria_storage.rds")
    png("graph/searchK_Austria.png", width = 800, height = 600)
    plot(storage)
    dev.off()
    
  #K=20 Looks good
    # stm_model <- stm(documents = out$documents,
    #                    vocab = out$vocab,
    #                    data = out$meta,
    #                    K=20,
    #                    prevalence =~ party + speaker + s(year)+ populist + total_immi_prop + left_right,
    #                    max.em.its = 500,
    #                    init.type = "Spectral", seed=9999, verbose =TRUE)
    # save(stm_model, file = "stm_model/final_austria_stm_model.Rdata")
    load("stm_model/final_austria_stm_model.Rdata")
  
    #Topic Distribution
    labelTopics(stm_model)
    plot(stm_model, type = "summary", xlim=c(0,0.3))
    
    #Key examples
    texts <-findThoughts(stm_model, texts = out$meta$raw_text, n=2, thresh = 0.1)
    example_texts<-texts[["docs"]]
    example_texts
    
    #Topic 7, 10, 12, 15, 16, 17, 19  is relevant
    prep_austria<-estimateEffect(1:20~ populist+total_immi_prop+left_right,
                                 stm_model, 
                                 meta=out$meta, uncertainty="Global")
    summary(prep_austria, topics=c(7,10,12,15,16,17,19), nsim=1000)
    
    effect_austria<-extract.estimateEffect(prep_austria, "populist",
                                           cov.value1 = "1",cov.value2 = "0", 
                                           model = stm_model, method = "difference") 
    effect_austria <- effect_austria %>% filter(topic %in% c(7,10,12,15,16,17,19))  
    
    effect_austria <- effect_austria %>% dplyr::mutate(country = "Austria",
                                                       topic_name = c("Asylum abuse & Crime",
                                                                      "Protection of Women & Child Asylum Seekers",
                                                                      "Threat to National Society",
                                                                      "Accountability & Human Rights in Law Enforcement",
                                                                      "Integration & Responsibility of Immigrants",
                                                                      "European Solidarity vs. National Sovereignty",
                                                                      "International cooperation for Refugees"))
    
    prep_austria2<-estimateEffect(1:20~ populist*total_immi_prop+left_right,
                                  stm_model, 
                                  meta=out$meta, uncertainty="Global")
    summary(prep_austria2, topics=c(7,10,12,15,16,17,19), nsim=1000)
    
    out$meta<-out$meta %>% mutate(dummy = ifelse(year %in% c(2015,2016), 1, 0))
    prep_austria3<-estimateEffect(1:20~ populist* total_immi_prop +left_right + dummy,
                                  stm_model, 
                                  meta=out$meta, uncertainty="Global")
    summary(prep_austria3, topics=c(7,10,12,15,16,17,19), nsim=1000)
    
    for(k in c(3,2)){
      current_prep <- if(k == 2) prep_austria2 else prep_austria3
      
      effect_austria2 <- lapply(c(0,1), function(i) {
        extract.estimateEffect(x = current_prep,
                               covariate = "total_immi_prop",
                               method = "continuous",
                               model = stm_model,
                               moderator = "populist",
                               moderator.value = i)
      })
      
      effect_austria2 <- do.call("rbind", effect_austria2)
      effect_austria2 <- effect_austria2 %>% filter(topic %in% c(7,10,12,15,16,17,19))
      effect_austria2 <- effect_austria2 %>% dplyr::mutate(country = "Austria",
                                                           topic_name = case_when(
                                                             topic ==7  ~ "Asylum abuse & Crime",
                                                             topic ==10 ~ "Protection of Women & Child Asylum Seekers",
                                                             topic ==12 ~ "Threat to National Society",
                                                             topic ==15 ~ "Accountability & Human Rights in Law Enforcement",
                                                             topic ==16 ~ "Integration & Responsibility of Immigrants",
                                                             topic ==17 ~ "European Solidarity vs. National Sovereignty",
                                                             topic ==19 ~ "International Cooperation for Refugees"))
      
      fig2<- effect_austria2 %>% ggplot(aes(x = covariate.value, y = estimate, ymin = ci.lower, ymax = ci.upper, 
                                            group = moderator.value, fill = factor(moderator.value)))+
        facet_wrap(~ reorder(topic_name,topic), ncol = 2, labeller = labeller(topic_name = label_wrap_gen(width = 50)))+
        geom_ribbon(alpha = .5) +
        geom_line(color = "red") +
        geom_line(aes(x = covariate.value, y = ci.lower), linetype="dashed")+
        geom_line(aes(x = covariate.value, y = ci.upper), linetype="dashed")+
        xlab("Austria")+
        ylab("")+
        scale_fill_discrete(name = "Populism", labels = c("Non-populist", "Populist"))+
        theme_bw()+
        theme(strip.text = element_text(size = 10),
              legend.position = "bottom",
              legend.direction = "horizontal")
      ggsave(filename = paste0("graph/austria_interaction_effect_topic", k, ".png"), 
             plot = fig2, width = 16, height = 12, dpi = 300)
    }
    
  #Topic proportion for each speeches
    relavant_topics <- c(7,10,12,15,16,17,19)
    a<-stm_model$theta   # document-topic distribution
    colnames(a)<-paste0("Topic ", seq(1:20))
    # Multiply all topic columns by 100 and calculate total
    topic.doc <- as.data.frame(a) %>%
      mutate(total = rowSums(., na.rm = TRUE))               # Sum all columns
    topic.doc <- topic.doc*100
    
    # Sum the relevant topics
    topic.doc$sum_relevant_topic <- rowSums(topic.doc[, relavant_topics])
    topic.doc <- topic.doc %>% mutate(sum_irrelevant_topic = 100-sum_relevant_topic)
    
    # Bind text and topic.doc, with an "all" column set to TRUE
    response_texts <- cbind(out$meta, topic.doc)
    austria_response_texts <-response_texts

## 4.3 STM: Sweden ============================================================    
  data_stm <- data_total %>% filter(country=="Sweden", immigration_agenda==1) 
  data_stm <- data_stm %>% 
    dplyr::select(country, election_id, party, speaker, year, 
                  populist, total_immi_prop, left_right, immigration_agenda, raw_text) %>% 
    mutate(docid = row_number())
  data_stm <- drop_na(data_stm)
  
  table(data_stm$populist)
  
  corp <- corpus(data_stm, text_field = "raw_text", docid_field = "docid", meta = TRUE)
  docvars(corp)$raw_text <- as.character(corp) # for stm (findThoughts)
  
  tokens <- tokens(corp,
                   what = "word",
                   remove_punct = TRUE, # removes punctuation
                   remove_symbols = TRUE, # removes unicode symbols
                   remove_numbers = TRUE, # removes numbers but not words starting with digits
                   remove_url = TRUE, # eliminates URLs beginning with http/https
                   remove_separators = TRUE, # removes separators as categorized in unicode
                   include_docvars = TRUE) # whether to pass on the docvars to the tokens object
  
  #Lower case
  doc_term_matrix <- dfm(tokens, tolower = TRUE)
  
  #Remove stopwords
  doc_term_matrix  <- dfm_select(doc_term_matrix, 
                                 pattern = stopwords("sv", source = "snowball"), 
                                 selection = "remove")
  
  #stem
  doc_term_matrix <- dfm_wordstem(doc_term_matrix)
  
  #trim to focus on common, but distinguishing features
  doc_term_matrix <- dfm_trim(doc_term_matrix, min_docfreq = 0.075,
                              max_docfreq = 0.90,
                              docfreq_type = "prop")
  
  #STM
    #filter out empty documents
    doc_term_matrix <- doc_term_matrix[ntoken(doc_term_matrix) > 0, ]
    doc_count <- ndoc(doc_term_matrix) #3595
  
  out<-quanteda::convert(doc_term_matrix, to="stm")
  
  #Number of K
  # storage <- searchK(documents = out$documents,
  #                    vocab = out$vocab,
  #                    data = out$meta,
  #                    K=c(5,10,15,20,25,30,35,40,45,50), #Can Change
  #                    prevalence =~ party + speaker + s(year)+ populist + total_immi_prop + left_right,
  #                    max.em.its = 500,
  #                    init.type = "Spectral", seed=9999, verbose =TRUE)
  # saveRDS(storage, "stm_model/sweden_storage.rds")
  
    #Visualization
    storage <-readRDS("stm_model/sweden_storage.rds")
    png("graph/searchK_Sweden.png", width = 800, height = 600)
    plot(storage)
    dev.off()

  #K=20 Looks good
    # stm_model <- stm(documents = out$documents,
    #                    vocab = out$vocab,
    #                    data = out$meta,
    #                    K=20,
    #                    prevalence =~ party + speaker + s(year)+ populist + total_immi_prop + left_right,
    #                    max.em.its = 500,
    #                    init.type = "Spectral", seed=9999, verbose =TRUE)
    #save(stm_model, file = "stm_model/final_sweden_stm_model.Rdata")
    load("stm_model/final_sweden_stm_model.Rdata")
  
    #Topic Distribution
    labelTopics(stm_model)
    plot(stm_model, type = "summary", xlim=c(0,0.3))
    
    #Key examples
    texts <-findThoughts(stm_model, texts = out$meta$raw_text, n=3, thresh = 0.1)
    example_texts<-texts[["docs"]]
    example_texts
    
    #Topic 5, 7, 11, 13, 15, 17, 19  is relevant
    prep_sweden<-estimateEffect(1:20~ populist+total_immi_prop+left_right,
                                stm_model, 
                                meta=out$meta, uncertainty="Global")
    summary(prep_sweden, topics=c(5,7,11,13,15,17, 19), nsim=1000)
    
    effect_sweden<-extract.estimateEffect(prep_sweden, "populist",
                                           cov.value1 = "1",cov.value2 = "0", 
                                           model = stm_model, method = "difference") 
    effect_sweden <- effect_sweden %>% filter(topic %in% c(5,7,11,13,15,17,19))  
    
    effect_sweden <- effect_sweden %>% dplyr::mutate(country = "Sweden",
                                                    topic_name = c("Labor Market Integration of Immigrant Women",
                                                                   "Local Burden and State Support",
                                                                   "Critique of Immigration Management",
                                                                   "Protection of Child & Family Immigrants",
                                                                   "EU Responsibility for Refugee Crisis",
                                                                   "Dispute over Refugee Integration Spending",
                                                                   "Labor Market Integration of Immigrants"
                                                                   ))
    
    prep_sweden2<-estimateEffect(1:20~ populist*total_immi_prop+left_right,
                                 stm_model, 
                                 meta=out$meta, uncertainty="Global")
    summary(prep_sweden2, topics=c(5,7,11,13,15,17,19), nsim=1000)
    
    out$meta<-out$meta %>% mutate(dummy = ifelse(year %in% c(2015,2016), 1, 0))
    prep_sweden3<-estimateEffect(1:20~ populist* total_immi_prop +left_right + dummy,
                                  stm_model, 
                                  meta=out$meta, uncertainty="Global")
    summary(prep_sweden3, topics=c(5,7,11,13,15,17,19), nsim=1000)
    
    for(k in c(3,2)){
      current_prep <- if(k == 2) prep_sweden2 else prep_sweden3
      
      effect_sweden2 <- lapply(c(0,1), function(i) {
        extract.estimateEffect(x = current_prep,
                               covariate = "total_immi_prop",
                               method = "continuous",
                               model = stm_model,
                               moderator = "populist",
                               moderator.value = i)
      })
      
      effect_sweden2 <- do.call("rbind", effect_sweden2)
      effect_sweden2 <- effect_sweden2 %>% filter(topic %in% c(5,7,11,13,15,17,19))
      effect_sweden2 <- effect_sweden2 %>% dplyr::mutate(country = "Sweden",
                                                         topic_name = case_when(
                                                           topic ==5 ~ "Labor Market Integration of Immigrant Women",
                                                           topic ==7 ~ "Local Burden and State Support",
                                                           topic ==11 ~"Critique of Immigration Management",
                                                           topic ==13 ~"Protection of Child & Family Immigrants",
                                                           topic ==15 ~"EU Responsibility for Refugee Crisis",
                                                           topic ==17 ~"Dispute over Refugee Integration Spending",
                                                           topic ==19 ~"Labor Market Integration of Immigrants"))
      
      fig3<- effect_sweden2 %>% ggplot(aes(x = covariate.value, y = estimate, ymin = ci.lower, ymax = ci.upper, 
                                           group = moderator.value, fill = factor(moderator.value)))+
        facet_wrap(~ reorder(topic_name,topic), ncol = 2, labeller = labeller(topic_name = label_wrap_gen(width = 50)))+
        geom_ribbon(alpha = .5) +
        geom_line(color = "red") +
        geom_line(aes(x = covariate.value, y = ci.lower), linetype="dashed")+
        geom_line(aes(x = covariate.value, y = ci.upper), linetype="dashed")+
        xlab("Sweden")+
        ylab("")+
        scale_fill_discrete(name = "Populism", labels = c("Non-populist", "Populist"))+
        theme_bw()+
        theme(strip.text = element_text(size = 10),
              legend.position = "bottom",
              legend.direction = "horizontal")
      ggsave(filename = paste0("graph/sweden_interaction_effect_topic", k, ".png"), fig3, width = 16, height = 12, dpi = 300)
    }
    
  #Topic proportion for each speeches
    relavant_topics <- c(5,7,11,13,15,17, 19)
    a<-stm_model$theta   # document-topic distribution
    colnames(a)<-paste0("Topic ", seq(1:20))
    # Multiply all topic columns by 100 and calculate total
    topic.doc <- as.data.frame(a) %>%
      mutate(total = rowSums(., na.rm = TRUE))               # Sum all columns
    topic.doc <- topic.doc*100
    
    # Sum the relevant topics
    topic.doc$sum_relevant_topic <- rowSums(topic.doc[, relavant_topics])
    topic.doc <- topic.doc %>% mutate(sum_irrelevant_topic = 100-sum_relevant_topic)
    
    # Bind text and topic.doc, with an "all" column set to TRUE
    response_texts <- cbind(out$meta, topic.doc)
    sweden_response_texts <-response_texts
    
## 4.4 STM: Denmark ============================================================    
  data_stm <- data_total %>% filter(country=="Denmark", immigration_agenda==1) 
  data_stm <- data_stm %>% 
    dplyr::select(country, election_id, party, speaker, year,
                  populist, total_immi_prop, left_right, immigration_agenda, raw_text) %>% 
    mutate(docid = row_number())
  data_stm <- drop_na(data_stm)
  table(data_stm$populist)
  
  corp <- corpus(data_stm, text_field = "raw_text", docid_field = "docid", meta = TRUE)
  docvars(corp)$raw_text <- as.character(corp) # for stm (findThoughts)
  
  tokens <- tokens(corp,
                   what = "word",
                   remove_punct = TRUE, # removes punctuation
                   remove_symbols = TRUE, # removes unicode symbols
                   remove_numbers = TRUE, # removes numbers but not words starting with digits
                   remove_url = TRUE, # eliminates URLs beginning with http/https
                   remove_separators = TRUE, # removes separators as categorized in unicode
                   include_docvars = TRUE) # whether to pass on the docvars to the tokens object
  
  #Lower case
  doc_term_matrix <- dfm(tokens, tolower = TRUE)
  
  #Remove stopwords
  doc_term_matrix  <- dfm_select(doc_term_matrix, 
                                 pattern = stopwords("da", source = "snowball"), 
                                 selection = "remove")
  
  #stem
  doc_term_matrix <- dfm_wordstem(doc_term_matrix)
  
  #trim to focus on common, but distinguishing features
  doc_term_matrix <- dfm_trim(doc_term_matrix, min_docfreq = 0.075,
                              max_docfreq = 0.90,
                              docfreq_type = "prop")
  
  #STM
    #filter out empty documents
    doc_term_matrix <- doc_term_matrix[ntoken(doc_term_matrix) > 0, ]
    doc_count <- ndoc(doc_term_matrix) #3595
  
  out<-quanteda::convert(doc_term_matrix, to="stm")
  
  #Number of K
  # storage <- searchK(documents = out$documents,
  #                    vocab = out$vocab,
  #                    data = out$meta,
  #                    K=c(5,10,15,20,25,30,35,40,45,50), #Can Change
  #                    prevalence =~ party + speaker + s(year)+ populist + total_immi_prop + left_right,
  #                    max.em.its = 500,
  #                    init.type = "Spectral", seed=9999, verbose =TRUE)
  # saveRDS(storage, "stm_model/denmark_storage.rds")
  
    #Visualization
    storage <-readRDS("stm_model/denmark_storage.rds")
    png("graph/searchK_Denmark.png", width = 800, height = 600)
    plot(storage)
    dev.off() 
    
  #K=20 Looks good
    # stm_model <- stm(documents = out$documents,
    #                    vocab = out$vocab,
    #                    data = out$meta,
    #                    K=20,
    #                    prevalence =~ party + speaker + s(year)+ populist + total_immi_prop + left_right,
    #                    max.em.its = 500,
    #                    init.type = "Spectral", seed=9999, verbose =TRUE)
    # save(stm_model, file = "stm_model/final_denmark_stm_model.Rdata")
    load("stm_model/final_denmark_stm_model.Rdata")
  
    #Topic Distribution
    labelTopics(stm_model)
    plot(stm_model, type = "summary", xlim=c(0,0.3))
    
    #Key examples
    texts <-findThoughts(stm_model, texts = out$meta$raw_text, n=5, thresh = 0.1)
    example_texts<-texts[["docs"]]
    example_texts
    
    #Topic 1,4,6,11,14,17,18  is relevant
    prep_denmark<-estimateEffect(1:20~ populist + total_immi_prop+left_right,
                                stm_model, 
                                meta=out$meta, uncertainty="Global")
    summary(prep_denmark, topics=c(1,4,6,11,14,17,18), nsim=1000)
    
    effect_denmark<-extract.estimateEffect(prep_denmark, "populist",
                                           cov.value1 = "1",cov.value2 = "0", 
                                           model = stm_model, method = "difference")
    
    effect_denmark <- effect_denmark %>% filter(topic %in% c(1,4,6,11,14,17,18))  
    
    effect_denmark <- effect_denmark %>% dplyr::mutate(country = "Denmark",
                                                       topic_name = c("Danish People's Party Immigration Positions",
                                                                      "Inter-party Challenges on Immigration Positions",
                                                                      "Critique of Liberal Immigration Positions",
                                                                      "Denmark’s Global Refugee Responsibility",
                                                                      "Labor Market Integration of Immigrants",
                                                                      "Legal Measures for Mass Arrival",
                                                                      "Local Burden and State Support"))
    
    prep_denmark2<-estimateEffect(1:20~ populist*total_immi_prop+left_right,
                                 stm_model, 
                                 meta=out$meta, uncertainty="Global")
    summary(prep_denmark2, topics=c(1,4,6,11,14,17,18), nsim=1000)
    
    out$meta<-out$meta %>% mutate(dummy = ifelse(year %in% c(2015,2016), 1, 0))
    prep_denmark3<-estimateEffect(1:20~ populist* total_immi_prop +left_right + dummy,
                                 stm_model, 
                                 meta=out$meta, uncertainty="Global")
    summary(prep_denmark3, topics=c(1,4,6,11,14,17,18), nsim=1000)
    
    for(k in c(3,2)){
      current_prep <- if(k == 2) prep_denmark2 else prep_denmark3
      
      effect_denmark2 <- lapply(c(0,1), function(i) {
        extract.estimateEffect(x = current_prep,
                               covariate = "total_immi_prop",
                               method = "continuous",
                               model = stm_model,
                               moderator = "populist",
                               moderator.value = i)
      })
      
      effect_denmark2 <- do.call("rbind", effect_denmark2)
      effect_denmark2 <- effect_denmark2 %>% filter(topic %in% c(1,4,6,11,14,17,18))
      effect_denmark2 <- effect_denmark2 %>% dplyr::mutate(country = "Denmark",
                                                           topic_name = case_when(
                                                             topic ==1  ~ "Danish People's Party Immigration Positions",
                                                             topic ==4 ~ "Inter-party Challenges on Immigration Positions",
                                                             topic ==6 ~ "Critique of Liberal Immigration Positions",
                                                             topic ==11 ~"Denmark’s Global Refugee Responsibility",
                                                             topic ==14 ~"Labor Market Integration of Immigrants",
                                                             topic ==17 ~"Legal Measures for Mass Arrival",
                                                             topic ==18 ~"Local Burden and State Support"))
      
      fig4<- effect_denmark2 %>% ggplot(aes(x = covariate.value, y = estimate, ymin = ci.lower, ymax = ci.upper, 
                                            group = moderator.value, fill = factor(moderator.value)))+
        facet_wrap(~ reorder(topic_name,topic), ncol = 2, labeller = labeller(topic_name = label_wrap_gen(width = 50)))+
        geom_ribbon(alpha = .5) +
        geom_line(color = "red") +
        geom_line(aes(x = covariate.value, y = ci.lower), linetype="dashed")+
        geom_line(aes(x = covariate.value, y = ci.upper), linetype="dashed")+
        xlab("Denmark")+
        ylab("")+
        scale_fill_discrete(name = "Populism", labels = c("Non-populist", "Populist"))+
        theme_bw()+
        theme(strip.text = element_text(size = 10),
              legend.position = "bottom",
              legend.direction = "horizontal")
      ggsave(filename = paste0("graph/denmark_interaction_effect_topic", k, ".png"), fig4, 
             width = 16, height = 12, dpi = 300)
    }
    
  #Topic proportion for each speeches
    relavant_topics <- c(1,4,6,11,14,17,18)
    a<-stm_model$theta   # document-topic distribution
    colnames(a)<-paste0("Topic ", seq(1:20))
    # Multiply all topic columns by 100 and calculate total
    topic.doc <- as.data.frame(a) %>%
      mutate(total = rowSums(., na.rm = TRUE))               # Sum all columns
    topic.doc <- topic.doc*100
    
    # Sum the relevant topics
    topic.doc$sum_relevant_topic <- rowSums(topic.doc[, relavant_topics])
    topic.doc <- topic.doc %>% mutate(sum_irrelevant_topic = 100-sum_relevant_topic)
    
    # Bind text and topic.doc, with an "all" column set to TRUE
    response_texts <- cbind(out$meta, topic.doc)
    denmark_response_texts <-response_texts
    
## 4.5 Visualization of STM ==================================================
  effect_all <- rbind(effect_austria, effect_germany, effect_sweden, effect_denmark)
  
  effect_all<-effect_all %>% mutate(country_num = case_when(
    country == "Austria" ~ 1,
    country == "Germany" ~ 3,
    country == "Sweden" ~ 4,
    country == "Denmark" ~ 2
  ))
  
  effect_all <- effect_all %>% arrange(country_num, topic)
 
  effect_all %>% 
    ggplot(aes(x = estimate, y=reorder(topic_name, -topic), xmin=ci.lower, xmax=ci.upper))+
    geom_point(size=1.5) +
    geom_errorbarh(height = 0.1) +
    #xlim(-0.06, 0.07)+
    xlab("")+
    ylab("")+
    geom_vline(xintercept = 0, color = "red") +
    scale_y_discrete(labels = label_wrap(40)) +
    facet_wrap(~ reorder(country, country_num), scales = "free_y", ncol = 2) +
    theme_bw()+
    theme(axis.text.y = element_text(size = 15),
          strip.text = element_text(size = 10))
  effect_all
  ggsave(filename = "graph/populist_effect_topic.png", width = 16, height = 12, dpi = 300)
  
  #Interaction term
  combined_plot2 <- (fig2 | fig4) +
    plot_layout(guides = "collect") &
    theme(legend.position = "none")
  combined_plot3 <- (fig1 | fig3) + 
    plot_layout(guides = "collect") &
    theme(legend.position = "bottom")
  
  # Display the combined plot
  combined_plot2
  combined_plot3
  
  ggsave("graph/all_interaction_effect_topic1.png",combined_plot2, width = 16, height = 12, dpi = 300)
  ggsave("graph/all_interaction_effect_topic2.png",combined_plot3, width = 16, height = 12, dpi = 300)
  
# 5. Sentiment analysis ========================================================
  austria_response_texts <- austria_response_texts %>% mutate(country = "Austria")
  denmark_response_texts <- denmark_response_texts %>% mutate(country = "Denmark")
  german_response_texts  <- german_response_texts %>% mutate(country = "Germany")
  sweden_response_texts  <- sweden_response_texts %>% mutate(country = "Sweden")
  
  all_response <- rbind(austria_response_texts, denmark_response_texts, german_response_texts, sweden_response_texts)
  
  saveRDS(all_response, "sentiment_analysis_python.rds")
  
  #Sentiment analysis would be on Python